### ####################################################
### Creates a comparison with (national) and local polls
### ####################################################

here::i_am("newR/07_compare_to_polling.R")

library(here)
library(tidyverse)

### Compare to by-election polls
polls <- read.csv(here::here("data",
                             "by-election-polls.csv"))

pred <- as.matrix(polls[,c("Con", "Lab", "Lib", "Nat")])
actual <- as.matrix(polls[,c("ConActual", "LabActual", "LibActual", "NatActual")])
errs <- abs(pred - actual)


### 3.6
mean_ae <- mean(errs, na.rm = TRUE)
### 2.7
med_ae <- median(errs, na.rm = TRUE)

### Correctly predicted
pcp <- mean(apply(actual, 1, which.max) == apply(pred, 1, which.max))

mean(errs[,1], na.rm = TRUE)
mean(errs[,2], na.rm = TRUE)
mean(errs[,3], na.rm = TRUE)
mean(errs[,4], na.rm = TRUE)


### What's the calibration?
### Suppose we assume a MoE of 3.3 (sample size of 855)
hi <- pred + 3.3
lo <- pred - 3.3


calib <- mean(actual > lo & actual < hi, na.rm = TRUE)

out <- data.frame(Statistic = c("Seats correctly predicted",
                                "Multiclass Brier score",
                                "Mean absolute error",
                                "Median absolute error",
                                "Predictions inside 95% interval"),
                  `Local polls` = c(100 * pcp,
                                  NA,
                                  mean_ae,
                                  med_ae,
                                  100 * calib))

saveRDS(out,
        file = here::here("working",
                          "local-poll-perf.rds"))

### Compare to national polls
source(here::here("newR", "00_helpers.R"))

dat <- readRDS(here::here("working",
                          "tidy_model_data.rds"))


pvars <- c("Con_BE", "Lab_BE", "Lib_BE", "Nat_BE", "Oth2_BE")
for (p in pvars) { 
    dat[,p] <- replace(dat[,p],
                       dat[,p] <= 0,
                       1 / 40000)
}

### Make sure everything sums to 1
dat[,pvars] <- dat[,pvars] / rowSums(dat[,pvars])

dat$Con.pred <- dat$Con_GE + dat$ConPollChg
dat$Lab.pred <- dat$Lab_GE + dat$LabPollChg
dat$Lib.pred <- dat$Lib_GE + dat$LibPollChg
dat$Nat.pred <- dat$Nat_GE + dat$NatPollChg
dat$Oth.pred <- dat$Oth_GE + dat$OthPollChg

dat$Con.pred <- dat$Con.pred * dat$ConCand_BE
dat$Lab.pred <- dat$Lab.pred * dat$LabCand_BE
dat$Lib.pred <- dat$Lib.pred * dat$LibCand_BE
dat$Nat.pred <- dat$Nat.pred * dat$NatCand_BE
dat$Oth.pred <- dat$Oth.pred * dat$OthCand_BE

yvars <- c("Con.pred", "Lab.pred", "Lib.pred", "Nat.pred", "Oth.pred")
dat[,yvars] <- dat[,yvars] / rowSums(dat[,yvars])

pred <- as.matrix(dat[,yvars])
actual <- as.matrix(dat[,pvars])

pcp <- mean(apply(actual, 1, which.max) == apply(pred, 1, which.max))

### Compare with "same party won as last time"
pred <- as.matrix(dat[,paste0(c("Con", "Lab", "Lib", "Nat", "Oth"), "_GE")])
pcp <- mean(apply(actual, 1, which.max) == apply(pred, 1, which.max))

errs <- abs(pred - actual)
mean_ae <- mean(unlist(errs))
### 9.2
med_ae <- median(unlist(errs))

lo <- pred - 0.03
hi <- pred + 0.03

calib <- mean(actual > lo & actual < hi, na.rm = TRUE)

out <- data.frame(Statistic = c("Seats correctly predicted",
                                "Multiclass Brier score",
                                "Mean absolute error",
                                "Median absolute error",
                                "Predictions inside 95% interval"),
                  `National polls` = c(100 * pcp,
                                  NA,
                                  100 * mean_ae,
                                  100 * med_ae,
                                  100 * calib))

saveRDS(out,
        file = here::here("working",
                          "national-poll-perf.rds"))
